# +~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~ #  
#
#' @title  Create dataset combining quarterly anti-elite strategy estimates with
#'          party system status indicators, quarterly poll averages, and 
#'          coalition inclusion probabilities
#' @author Hauke Licht
#
# +~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~ #

# setup ----

# load packages
library(readr)
library(dplyr)
library(lubridate)
library(purrr)


# define data paths (all relative to the current working directory)
base_path <- "."
data_path <- file.path(base_path, "data")
input_path <- file.path(data_path, "input")    # inputs read by this script
output_path <- file.path(data_path, "output")  # results are read from and written to here
exdata_path <- file.path(data_path, "exdata")  # CSV files (party codes, polls, CIP)

# load data ----

# parliamentary configurations (joined to the quarterly data in step 2 below)
parl_configs <- input_path %>% 
  file.path("parl_configs_with_party_system_status.rds") %>% 
  read_rds()

# time series object; only its `quarter` element is used in this script
ts <- output_path %>% 
  file.path("parl_party_tweets_elitecriticism_fixed_window_timeseries.rds") %>% 
  read_rds()

# mapping of party IDs (column types are guessed by readr)
party_codes <- exdata_path %>% 
  file.path("party_codes_mapping.csv") %>% 
  read_csv()

# load quarter averages of (smoothed) poll of polls ----

# compact column spec: c = character, d = double, i = integer (8 columns)
fp <- file.path(exdata_path, "polls_quarter_averages.csv")
polls <- read_csv(file = fp, col_types = "ccdcdcic")

# load and prepare coalition inclusion probabilities (CIP) data ----

# compact column spec: as above, plus D = date (11 columns)
fp <- file.path(exdata_path, "coalition_inclusion_probabilities.csv")
cip_data <- read_csv(file = fp, col_types = "cciDccidddd")

# aggregate monthly CIP data at quarter
#  - rows without a CIP estimate are dropped before averaging
#  - quarter labels are built as "<year>-<two-digit quarter number>" from the
#    `year` column and the quarter of `date`
#  - `lubridate::quarter()` is namespaced because a column called `quarter` is 
#    created in the same call
#  - the result is explicitly ungrouped: `summarise()` only peels off the last 
#    grouping variable, which would otherwise leave the data grouped by 
#    country and quarter for all later steps
cip_data_quarterly <- cip_data %>% 
  filter(!is.na(pr_ingov_mean)) %>% 
  mutate(quarter = sprintf("%d-%02d", year, lubridate::quarter(date))) %>% 
  group_by(country_iso3c, quarter, party_id = parlgov_id) %>% 
  summarise(
    cip_mean = mean(pr_ingov_mean)
    , n_obs = n_distinct(date)
  ) %>% 
  ungroup()

# should be <= 3 (because one quarter has at most 3 months)
table(cip_data_quarterly$n_obs)
if (any(cip_data_quarterly$n_obs > 3L)) {
  # non-fatal: flag the violated expectation instead of relying on eyeballing
  warning(
    "Some party-quarters aggregate more than 3 distinct dates of CIP estimates."
    , call. = FALSE
  )
}
cip_data_quarterly$n_obs <- NULL
table(is.na(cip_data_quarterly$cip_mean))

# Bring everything together ----

## 1) add party codes to quarterly data ----

# lookup table with the party-facts and CMP party IDs; the first three columns
# serve as join keys
party_id_lookup <- select(
  party_codes
  , country_iso3c, party_id, party_name_short
  , party_id_partyfacts, party_id_cmp
)

# start from the quarterly time series, dropping columns not needed downstream
dat <- ts$quarter %>% 
  select(-c(user_ids, first_date)) %>% 
  left_join(
    party_id_lookup
    , by = c("country_iso3c", "party_id", "party_name_short")
  )

## 2) add parliamentary configurations data ----

# derive `cabinet_formation_period` as the negation of `cabinet_formation`, 
# then drop the columns that are not carried over into the final data
parl_configs_prepped <- parl_configs %>% 
  mutate(cabinet_formation_period = !cabinet_formation) %>% 
  mutate(party_id_manual = NULL, cabinet_formation = NULL)

# identically named key columns in the quarterly data and the configurations
parl_config_keys <- c(
  "country_iso3c"
  , "party_id"
  , "party_name_short"
  , "pre_elec_period"
  , "election_date"
  , "election_date_next"
  , "elec_period_sdate"
  , "elec_period_edate"
  , "parl_config_sdate"
  , "parl_config_edate"
)

dat <- left_join(dat, parl_configs_prepped, by = parl_config_keys)

## 3) join quarterly poll-of-polls aggregates ----

# keep only the join keys and the poll estimate;
#  - a "0" is inserted after the first hyphen in `quarter` 
#    (e.g., "2019-1" becomes "2019-01")
#  - `matched` is a temporary flag used to count unmatched rows after the join
polls_quarterly <- polls %>% 
  transmute(
    country_iso3c
    , quarter = sub("-", "-0", quarter)
    , party_id_pgv
    , party_id_pf
    , spolls_mean
    , matched = TRUE
  )

# left-hand names refer to `dat`, right-hand names to the polls data
polls_join_keys <- c(
  "country_iso3c"
  , "quarter"
  , "party_id" = "party_id_pgv"
  , "party_id_partyfacts" = "party_id_pf"
)

dat <- left_join(dat, polls_quarterly, by = polls_join_keys)

# where is polling data missing?
summarise(
  group_by(dat, country_iso3c)
  , prop_polls_missing = mean(is.na(matched))
)
# mainly AUS, CAN, NZL (not in coverage of Kaiser & Rehmert's data)

# the helper flag has served its purpose
dat[["matched"]] <- NULL

## 4) join quarterly aggregates of coalition inclusion probability (CIP) estimates ----

# first, fix some issues:
# 1. German Greens should have ParlGov ID 772, not 255
#    (the recode is restricted to German rows so that an ID 255 occurring for 
#    any other country is left untouched; `which()` ignores NA comparisons)
is_deu_greens <- cip_data_quarterly$country_iso3c == "DEU" & 
  cip_data_quarterly$party_id == 255
cip_data_quarterly$party_id[which(is_deu_greens)] <- 772

# a left join duplicates rows of `dat` if the CIP data holds more than one row 
# per country-quarter-party (which the recoding above could cause), so compare
# the number of rows before and after the join
n_rows_before <- nrow(dat)
dat <- left_join(dat, cip_data_quarterly, by = c("country_iso3c", "quarter", "party_id"))
if (nrow(dat) != n_rows_before) {
  warning(
    "Joining the CIP data changed the number of rows from ", n_rows_before
    , " to ", nrow(dat), "; check the CIP data for duplicated keys."
    , call. = FALSE
  )
}

# report missingness 
dat %>% 
  group_by(country_iso3c) %>% 
  summarise(prop_cip_missing = mean(is.na(cip_mean)))
# again, AUS, CAN, NZL (not in coverage of Kaiser & Rehmert's data)

# write to disk ----

# NOTE: an existing file is never overwritten; delete it manually to refresh 
#       the output after changing any of the inputs
fp <- file.path(output_path, "party_antielite_strategies_quarterly_data.rds")
if (!file.exists(fp)) {
  write_rds(dat, fp)
} else {
  # make the skipped write visible so that stale output does not go unnoticed
  message("File '", fp, "' already exists and was not overwritten.")
}

# create codebook ----

# one row per documented variable with a human-readable description
codebook <- tribble(
  ~variable, ~description,
  "country_iso3c", "country ISO-3-character code",
  "party_id", "ParlGov party ID",
  "party_name_short", "ParlGov party abbreviation",
  "quarter", "year quarter indicator (YYYY-01 for first three months, YYYY-02 for months 4-6, etc.)",
  "n_tweets", "number of tweets in party--quarter",
  "n_elitecriticism", "number of tweets in party--quarter with Pr(elitecriticism) ≥ .5",
  "prop_elitecriticism", "proportion of elite-critical tweets in party--quarter",
  "mean_prob_elitecriticism", "quarterly mean of Pr(elitecriticism) in party--quarter",
  "pre_elec_period", "boolean indicator, TRUE if tweet data comes from a period where the party had not been in parliament but was about to enter in the next election",
  "election_id", "ParlGov election ID (not available for some manually added elections)",
  "election_date", "election date for electoral period corresponding to quarter",
  "election_date_next", "date of upcoming election in electoral period corresponding to quarter",
  "elec_period_sdate", "start date of electoral period corresponding to quarter",
  "elec_period_edate", "end date of electoral period corresponding to quarter",
  "parl_config_sdate", "start date of parliamentary configuration corresponding to quarter",
  "parl_config_edate", "end date of parliamentary configuration corresponding to quarter",
  "party_id_partyfacts", "party-facts party ID",
  "party_id_cmp", "CMP party ID",
  "vote_share", "vote share in election period corresponding to quarter",
  "seats", "party's number of seats in election period corresponding to quarter",
  "seats_total", "total number of seats in election period corresponding to quarter",
  "party_in_parliament", "boolean indicator, TRUE if party is represented in parliament in parliamentary configuration corresponding to quarter",
  "parl_entry", "boolean indicator, TRUE if party has just (re-)entered parliament in election period corresponding to quarter",
  "parl_exit", "boolean indicator, TRUE if party has lost parliamentary representation in election period corresponding to quarter",
  "cabinet_id", "ParlGov cabinet ID of parliamentary configuration corresponding to quarter",
  "party_in_cabinet", "boolean indicator, TRUE if party is cabinet member in parliamentary configuration corresponding to quarter",
  "party_pm", "boolean indicator, TRUE if party sponsors the prime minister in parliamentary configuration corresponding to quarter",
  "is_chal", "boolean indicator, TRUE if party is challenger in parliamentary configuration corresponding to quarter",
  "is_mop", "boolean indicator, TRUE if party is mainstream opposition party in parliamentary configuration corresponding to quarter",
  "is_govt", "boolean indicator, TRUE if party is government party in parliamentary configuration corresponding to quarter",
  "party_system_status", "categorical indicator specifying party's party system status in parliamentary configuration corresponding to quarter",
  "cabinet_formation_period", "boolean indicator, TRUE if parliamentary configuration corresponding to quarter is a period where a cabinet had not yet been formed",
  "spolls_mean", "quarterly mean of smoothed poll-of-polls estimates",
  "cip_mean", "quarterly mean of coalition inclusion probability (CIP) estimates"
)

# derive the codebook path from the data file path; the pattern is anchored at 
# the end so that only the file extension is replaced
fp <- sub("\\.rds$", "-codebook.tab", fp)
if (!file.exists(fp)) {
  write_tsv(codebook, fp)
} else {
  # make the skipped write visible so that a stale codebook does not go unnoticed
  message("File '", fp, "' already exists and was not overwritten.")
}

